import numpy as np
import pandas as pd
import os
import glob
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
pip install chart_studio
Requirement already satisfied: chart_studio in ./opt/anaconda3/lib/python3.9/site-packages (1.1.0) Requirement already satisfied: six in ./opt/anaconda3/lib/python3.9/site-packages (from chart_studio) (1.16.0) Requirement already satisfied: requests in ./opt/anaconda3/lib/python3.9/site-packages (from chart_studio) (2.27.1) Requirement already satisfied: plotly in ./opt/anaconda3/lib/python3.9/site-packages (from chart_studio) (5.6.0) Requirement already satisfied: retrying>=1.3.3 in ./opt/anaconda3/lib/python3.9/site-packages (from chart_studio) (1.3.3) Requirement already satisfied: tenacity>=6.2.0 in ./opt/anaconda3/lib/python3.9/site-packages (from plotly->chart_studio) (8.0.1) Requirement already satisfied: certifi>=2017.4.17 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->chart_studio) (2021.10.8) Requirement already satisfied: urllib3<1.27,>=1.21.1 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->chart_studio) (1.26.9) Requirement already satisfied: idna<4,>=2.5 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->chart_studio) (3.3) Requirement already satisfied: charset-normalizer~=2.0.0 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->chart_studio) (2.0.4) Note: you may need to restart the kernel to use updated packages.
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.offline as pyoff
#files = os.path.join("/Users/hassaniftikhar4472/Downloads/data/cloud_to_local", "*.csv")
#files = glob.glob(files)
#df = pd.concat(map(pd.read_csv, files), ignore_index=True)
#df.columns =['User_ID', 'Date', 'playtime','mnisimp', 'mnrvimp', 'mnisrev', 'mnrvrev', 'mnadrev', 'mniaprev','revtotal']
#df.drop(columns = ['mnrvimp','mnrvrev','mnadrev','mniaprev','revtotal','playtime'], axis=1, inplace=True)
#df['Date'] = pd.to_datetime(df['Date'])
#df.head()
# saving the dataframe
df = pd.read_csv('/Users/hassaniftikhar4472/Downloads/data/data_3_months.csv')
tx_data = df
tx_data['Date'] = pd.to_datetime(df['Date'])
#creating YearMonth field for the ease of reporting and visualization
tx_data['Date'] = tx_data['Date'].map(lambda date: 100*date.month + date.day)
#calculate Revenue for each row and create a new dataframe with YearMonth - Revenue columns
tx_data['Revenue'] = tx_data['mnisrev']
tx_revenue = tx_data.groupby(['Date'])['Revenue'].sum().reset_index()
tx_revenue
| Date | Revenue | |
|---|---|---|
| 0 | 801 | 18103.157642 |
| 1 | 802 | 17830.293629 |
| 2 | 803 | 17963.880245 |
| 3 | 804 | 18013.373888 |
| 4 | 805 | 17879.153577 |
| ... | ... | ... |
| 64 | 1004 | 15850.369094 |
| 65 | 1005 | 15669.182535 |
| 66 | 1006 | 15369.523638 |
| 67 | 1007 | 15243.490637 |
| 68 | 1008 | 15383.735483 |
69 rows × 2 columns
#X and Y axis inputs for Plotly graph. We use Scatter for line graphs
plot_data = [
go.Scatter(
x=tx_revenue['Date'],
y=tx_revenue['Revenue'],
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},
title='Montly Revenue'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
#using pct_change() function to see monthly percentage change
tx_revenue['MonthlyGrowth'] = tx_revenue['Revenue'].pct_change()
#showing first 5 rows
tx_revenue.head()
#visualization - line graph
plot_data = [
go.Scatter(
x=tx_revenue.query("Date < 1008")['Date'],
y=tx_revenue.query("Date < 1008")['MonthlyGrowth'],
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},
title='Montly Growth Rate'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
#creating monthly active customers dataframe by counting unique Customer IDs
tx_monthly_active = tx_data.groupby('Date')['User_ID'].nunique().reset_index()
#plotting the output
plot_data = [
go.Bar(
x=tx_monthly_active['Date'],
y=tx_monthly_active['User_ID'],
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},
title='Monthly Active Users'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
#create a new dataframe for no. of INT by using quantity field
tx_monthly_sales = tx_data.groupby('Date')['mnisimp'].sum().reset_index()
#plot
plot_data = [
go.Bar(
x=tx_monthly_sales['Date'],
y=tx_monthly_sales['mnisimp'],
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},
title='Monthly Total # INT'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# create a new dataframe for average revenue by taking the mean of it
tx_monthly_order_avg = tx_data.groupby('Date')['Revenue'].mean().reset_index()
#print the dataframe
tx_monthly_order_avg
#plot the bar chart
plot_data = [
go.Bar(
x=tx_monthly_order_avg['Date'],
y=tx_monthly_order_avg['Revenue'],
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},
title='Monthly INT Average'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
tx_uk = tx_data
#create a dataframe contaning CustomerID and first purchase date
tx_min_purchase = tx_uk.groupby('User_ID').Date.min().reset_index()
tx_min_purchase.columns = ['User_ID','MinINTDate']
#tx_min_purchase['MinPurchaseYearMonth'] = tx_min_purchase['MinPurchaseDate'].map(lambda date: 100*date.year + date.month)
#merge first purchase date column to our main dataframe (tx_uk)
tx_uk = pd.merge(tx_uk, tx_min_purchase, on='User_ID')
print(tx_uk.head())
#create a column called User Type and assign Existing
#if User's First Purchase Year Month before the selected Invoice Year Month
tx_uk['UserType'] = 'New'
tx_uk.loc[tx_uk['Date']>tx_uk['MinINTDate'],'UserType'] = 'Existing'
#calculate the Revenue per month for each user type
tx_user_type_revenue = tx_uk.groupby(['Date','UserType'])['Revenue'].sum().reset_index()
User_ID Date playtime mnisimp mnisrev \
0 0fb1ba8c-e46d-4b08-9ace-f0ea12af0e01 912 0 0 0.000000
1 0fb1ba8c-e46d-4b08-9ace-f0ea12af0e01 909 0 0 0.000000
2 0fb1ba8c-e46d-4b08-9ace-f0ea12af0e01 910 0 0 0.000000
3 ed17d241-fa4c-4de3-b3f3-b22d97c0c29b 912 0 0 0.000000
4 ed17d241-fa4c-4de3-b3f3-b22d97c0c29b 905 0 1 0.002967
Revenue MinINTDate
0 0.000000 909
1 0.000000 909
2 0.000000 909
3 0.000000 903
4 0.002967 903
tx_user_type_revenue
| Date | UserType | Revenue | |
|---|---|---|---|
| 0 | 801 | New | 18050.743430 |
| 1 | 802 | Existing | 11616.243356 |
| 2 | 802 | New | 6161.086208 |
| 3 | 803 | Existing | 12979.809498 |
| 4 | 803 | New | 4933.343764 |
| ... | ... | ... | ... |
| 116 | 928 | New | 2214.006095 |
| 117 | 929 | Existing | 13980.201803 |
| 118 | 929 | New | 2202.078237 |
| 119 | 930 | Existing | 13863.189135 |
| 120 | 930 | New | 2158.099729 |
121 rows × 3 columns
#filtering the dates and plot the result
tx_user_type_revenue = tx_user_type_revenue.query("Date != 801 and Date != 1008 and Date != 1007 and Date != 1006 and Date != 1005 and Date != 1004 and Date != 1003 and Date != 1002 and Date != 1001")
plot_data = [
go.Scatter(
x=tx_user_type_revenue.query("UserType == 'Existing'")['Date'],
y=tx_user_type_revenue.query("UserType == 'Existing'")['Revenue'],
name = 'Existing'
),
go.Scatter(
x=tx_user_type_revenue.query("UserType == 'New'")['Date'],
y=tx_user_type_revenue.query("UserType == 'New'")['Revenue'],
name = 'New'
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},
title='New vs Existing'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
#create a dataframe that shows new user ratio - we also need to drop NA values
tx_user_ratio = tx_uk.query("UserType == 'New'").groupby(['Date'])['User_ID'].nunique()/tx_uk.query("UserType == 'Existing'").groupby(['Date'])['User_ID'].nunique()
tx_user_ratio = tx_user_ratio.reset_index()
tx_user_ratio = tx_user_ratio.dropna()
#print the dafaframe
tx_user_ratio
#plot the result
plot_data = [
go.Bar(
x=tx_user_ratio.query("Date>802 and Date<1001")['Date'],
y=tx_user_ratio.query("Date>802 and Date<1001")['User_ID'],
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},
title='New Customer Ratio'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
tx_user_ratio
| Date | User_ID | |
|---|---|---|
| 1 | 802 | 0.779520 |
| 2 | 803 | 0.499322 |
| 3 | 804 | 0.402941 |
| 4 | 805 | 0.363951 |
| 5 | 806 | 0.345727 |
| ... | ... | ... |
| 64 | 1004 | 0.135245 |
| 65 | 1005 | 0.135044 |
| 66 | 1006 | 0.129226 |
| 67 | 1007 | 0.132373 |
| 68 | 1008 | 0.144850 |
68 rows × 2 columns
#identify which users are active by looking at their revenue per month
tx_user_purchase = tx_uk.groupby(['User_ID','Date'])['Revenue'].sum().reset_index()
#create retention matrix with crosstab
tx_retention = pd.crosstab(tx_user_purchase['User_ID'], tx_user_purchase['Date']).reset_index()
tx_retention.head()
#create an array of dictionary which keeps Retained & Total User count for each month
months = tx_retention.columns[2:]
retention_array = []
for i in range(len(months)-1):
retention_data = {}
selected_month = months[i+1]
prev_month = months[i]
retention_data['Date'] = int(selected_month)
retention_data['TotalUserCount'] = tx_retention[selected_month].sum()
retention_data['RetainedUserCount'] = tx_retention[(tx_retention[selected_month]>0) & (tx_retention[prev_month]>0)][selected_month].sum()
retention_array.append(retention_data)
#convert the array to dataframe and calculate Retention Rate
tx_retention = pd.DataFrame(retention_array)
tx_retention['RetentionRate'] = tx_retention['RetainedUserCount']/tx_retention['TotalUserCount']
#plot the retention rate graph
plot_data = [
go.Scatter(
x=tx_retention.query("Date<1001")['Date'],
y=tx_retention.query("Date<1001")['RetentionRate'],
name="organic"
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},
title='Monthly Retention Rate'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
tx_retention
| Date | TotalUserCount | RetainedUserCount | RetentionRate | |
|---|---|---|---|---|
| 0 | 803 | 814997 | 460394 | 0.564903 |
| 1 | 804 | 819135 | 460291 | 0.561923 |
| 2 | 805 | 823904 | 457007 | 0.554685 |
| 3 | 806 | 818128 | 444635 | 0.543479 |
| 4 | 807 | 843044 | 446827 | 0.530016 |
| ... | ... | ... | ... | ... |
| 62 | 1004 | 744263 | 418285 | 0.562012 |
| 63 | 1005 | 742684 | 418853 | 0.563972 |
| 64 | 1006 | 741363 | 418218 | 0.564120 |
| 65 | 1007 | 744697 | 417250 | 0.560295 |
| 66 | 1008 | 743989 | 404438 | 0.543607 |
67 rows × 4 columns